In [ ]:
!pip install pyspark
Requirement already satisfied: pyspark in c:\users\user\appdata\local\packages\pythonsoftwarefoundation.python.3.11_qbz5n2kfra8p0\localcache\local-packages\python311\site-packages (3.5.1) Requirement already satisfied: py4j==0.10.9.7 in c:\users\user\appdata\local\packages\pythonsoftwarefoundation.python.3.11_qbz5n2kfra8p0\localcache\local-packages\python311\site-packages (from pyspark) (0.10.9.7)
In [ ]:
from pyspark.sql import *
from pyspark.sql import functions as func
spark = SparkSession.builder.getOrCreate()
In [ ]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
.appName("abd") \
.getOrCreate()
fashion_accessories_df = "hdfs://localhost:9000/abd/All_Category/fashion__accessories.txt"
data_storage_df = "hdfs://localhost:9000/abd/All_Category/data_storage.txt"
perfume_cologne_df = "hdfs://localhost:9000/abd/All_Category/perfume__cologne.txt"
automotive_tools_df = "hdfs://localhost:9000/abd/All_Category/automotive__tools.txt"
beauty_personal_care_df = "hdfs://localhost:9000/abd/All_Category/beauty__personal_care.txt"
bath_body_df = "hdfs://localhost:9000/abd/All_Category/bath__body.txt"
shaving_hair_removal_products_df = "hdfs://localhost:9000/abd/All_Category/shaving__hair_removal_products.txt"
handmade_jewellery_df = "hdfs://localhost:9000/abd/All_Category/handmade_jewellery.txt"
kids_babies_df = "hdfs://localhost:9000/abd/All_Category/kids__babies.txt"
luggage_travel_gear_df = "hdfs://localhost:9000/abd/All_Category/luggage__travel_gear.txt"
home_decor_df = "hdfs://localhost:9000/abd/All_Category/home__decor.txt"
pets_df = "hdfs://localhost:9000/abd/All_Category/pets.txt"
handmade_kitchen_dining_df = "hdfs://localhost:9000/abd/All_Category/handmade_kitchen__dining.txt"
outdoor_cooking_df = "hdfs://localhost:9000/abd/All_Category/outdoor__cooking.txt"
men_df = "hdfs://localhost:9000/abd/All_Category/men.txt"
women_df = "hdfs://localhost:9000/abd/All_Category/women.txt"
grocery_df = "hdfs://localhost:9000/abd/All_Category/grocery.txt"
work_safety_df = "hdfs://localhost:9000/abd/All_Category/work_safety.txt"
hobbies_crafts_df = "hdfs://localhost:9000/abd/All_Category/hobbies__crafts.txt"
toys_games_df = "hdfs://localhost:9000/abd/All_Category/toys__games.txt"
# Reading text files
fashion_accessories_text = spark.sparkContext.textFile(fashion_accessories_df)
data_storage_text = spark.sparkContext.textFile(data_storage_df)
perfume_cologne_text = spark.sparkContext.textFile(perfume_cologne_df)
automotive_tools_text = spark.sparkContext.textFile(automotive_tools_df)
beauty_personal_care_text = spark.sparkContext.textFile(beauty_personal_care_df)
bath_body_text = spark.sparkContext.textFile(bath_body_df)
shaving_hair_removal_products_text = spark.sparkContext.textFile(shaving_hair_removal_products_df)
handmade_jewellery_text = spark.sparkContext.textFile(handmade_jewellery_df)
kids_babies_text = spark.sparkContext.textFile(kids_babies_df)
luggage_travel_gear_text = spark.sparkContext.textFile(luggage_travel_gear_df)
home_decor_text = spark.sparkContext.textFile(home_decor_df)
pets_text = spark.sparkContext.textFile(pets_df)
handmade_kitchen_dining_text = spark.sparkContext.textFile(handmade_kitchen_dining_df)
outdoor_cooking_text = spark.sparkContext.textFile(outdoor_cooking_df)
men_text = spark.sparkContext.textFile(men_df)
women_text = spark.sparkContext.textFile(women_df)
grocery_text = spark.sparkContext.textFile(grocery_df)
work_safety_text = spark.sparkContext.textFile(work_safety_df)
hobbies_crafts_text = spark.sparkContext.textFile(hobbies_crafts_df)
toys_games_text = spark.sparkContext.textFile(toys_games_df)
In [ ]:
# Function to explore data
def explore_data(data, title):
print(f"Exploring {title}:")
print("Number of lines:", data.count()) # Count number of lines
print("Sample data:")
for line in data.take(5): # Display sample data
print(line)
print("\n")
# Explore each dataset
explore_data(fashion_accessories_text, "Fashion Accessories")
explore_data(data_storage_text, "Data Storage")
explore_data(perfume_cologne_text, "Perfume & Cologne")
explore_data(automotive_tools_text, "Automotive Tools")
explore_data(beauty_personal_care_text, "Beauty & Personal Care")
explore_data(bath_body_text, "Bath & Body")
explore_data(shaving_hair_removal_products_text, "Shaving & Hair Removal Products")
explore_data(handmade_jewellery_text, "Handmade Jewellery")
explore_data(kids_babies_text, "Kids & Babies")
explore_data(luggage_travel_gear_text, "Luggage & Travel Gear")
explore_data(home_decor_text, "Home Decor")
explore_data(pets_text, "Pets")
explore_data(handmade_kitchen_dining_text, "Handmade Kitchen & Dining")
explore_data(outdoor_cooking_text, "Outdoor Cooking")
explore_data(men_text, "Men")
explore_data(women_text, "Women")
explore_data(grocery_text, "Grocery")
explore_data(work_safety_text, "Work & Safety")
explore_data(hobbies_crafts_text, "Hobbies & Crafts")
explore_data(toys_games_text, "Toys & Games")
Exploring Fashion Accessories: Number of lines: 315017 Sample data: Fruit of the Loom Boys' Eversoft Cotton Undershirts, T Shirts & Tank Tops Hanes Boys' Socks, Double Tough Cushioned Crew Socks, 12-pair Packs The Children's Place Baby Toddler Boys Long Sleeve Oxford Button Down Shirt Minecraft Boys' 6-Piece Snug-fit Cotton Pajamas Set Hanes Boys' Socks, Double Tough Cushioned Ankle and No Show, 12-Pair Packs Exploring Data Storage: Number of lines: 22054 Sample data: Samsung 980 PRO SSD 2TB PCIe NVMe Gen 4 Gaming M.2 Internal Solid State Hard Drive Memory Card, Maximum Speed, Thermal Control, MZ-V8P2T0B, Black WD_BLACK 2TB SN770 NVMe Internal Gaming SSD Solid State Drive - Gen4 PCIe, M.2 2280, Up to 5,150 MB/s - WDS200T3X0E Samsung 970 EVO Plus 2TB NVMe M.2 Internal SSD (MZ-V7S2T0B/AM) [Canada Version] Samsung 970 EVO Plus 1TB NVMe M.2 Internal SSD (MZ-V7S1T0/AM) [Canada Version] Seagate Storage Expansion Card for Xbox Series X|S 2TB Solid State Drive - NVMe Expansion SSD for Xbox Series X|S (STJR2000400) Exploring Perfume & Cologne: Number of lines: 18582 Sample data: Nautica Voyage Eau De Toilette for Men - Fresh, Romantic, Fruity Scent - Woody, Aquatic Notes of Apple, Water Lotus, Cedarwood, and Musk - Ideal for Day Wear - 3.3 Fl Oz Pure Instinct CRAVE Roll-On The Original Pheromone Infused Essential Oil Perfume Cologne – For Her - TSA Ready 0.34 fl oz NIVEA Men Sensitive Skin Cooling After Shave Balm (100mL), Aftershave for Sensitive Skin, No Drying Alcohol, Instantly Soothes & Cools Down Skin After Shaving PB ParfumsBelcam Vault, our version of Armani Code, EDT Spray, 100 ml (Pack of 1) 2 Pcs Pheromones Perfume for Women,Romantic Pheromone Glitter Perfume,Flirty Aroma Lusting Pheromone Perfume,Essential Oil Perfume with Pheromones for Women to Attracting Men. Exploring Automotive Tools: Number of lines: 90159 Sample data: PUREBURG 2-Pack Replacement HEPA Filters Compatible with Therapure TPP240F Fits Envion TPP240 TPP230 Air Purifiers "Happybuy Stainless Steel Cable 3/16""x 500ft, T304 Marine Grade Deck Cable Railing, 7x19 Strands Construction Braided Aircraft Cable for Deck Rail String Lights Hanging Porch Fence DIY Baluster" "HOME STAIRWAY LTD. : Stair Iron Railing - 1/2"" Square Metal Balusters in Satin Black - Box of 10 (Single Collar)" Govee Life Smart Space Heater, Electric Space Heater with Thermostat, Wi-Fi & Bluetooth App Control, Works with Alexa & Google Assistant, 1500W Ceramic Heater for Bedroom, Indoors, Office, Living Room LEVOIT Air Purifiers for Bedroom Home, HEPA Freshener Filter Small Room Cleaner with Fragrance Sponge for Smoke, Allergies, Pet Dander, Odor, Dust Remover, Office, Desktop, Table Top, Core Mini, White Exploring Beauty & Personal Care: Number of lines: 140876 Sample data: H2ofloss Cordless Water Dental Flosser, Portable Oral Irrigator for Teeth, Braces, Rechargeable & IPX7 Waterproof Teeth Cleaner for Home Travel Ionic NanoSteamer - 3-in-1 Facial Steamer with Precise Temp Control - Atomizer - Mist - Humidifier- Unclogs Pores - Blackheads - Spa Quality - NanoSteam beautyblender blendercleanser solid, 1 ounce cleanser Brightup Beard Trimmer for Men, Hair Clippers & Hair Trimmer for Men, IPX7 Waterproof Mustache Face Nose Ear Body Shavers Electric Razor Men, Mens Gifts, USB Rechargeable & LCD Display, FK-8688T Crest 3D White Whitestrips Professional Effects Teeth Whitening Kit, 22 Treatments, 13 Levels Whiter Exploring Bath & Body: Number of lines: 17181 Sample data: Handheld Bath Brush with Long Handle Shower Brush-Soft & Comfortable Dry Skin Body Massage Brush Back Exfoliation Brushes (Blue) Bath Body Brush with Comfy Bristles Non-slip Long Handle Gentle Exfoliation Improve Skin's Health and Beauty, Shower Brush Back Scrubber for Men and Women Relaxing Spa Massage (Pink) Silicone Body Scrubber 2pcs, 2 in 1 Bath and Shampoo Brush with Handle, Exfoliating Bath Body Scrub Brush for Shower High quality dry brush body brush, bath brush 5-piece set, natural bristle long handle bath brush, facial brush, exfoliating bath body brush, foot pumice stone, back cotton linen bath rub, cellulite massage brush for lymphatic detoxification TXV Mart 100% Natural Exfoliating Sisal Bath Gloves Sponge Scrubber Deeply Clean Remove Dead Skin, Bathroom, Shower, Spa - 1 Pair Exploring Shaving & Hair Removal Products: Number of lines: 17769 Sample data: Philips OneBlade Face & Body Kit with Li-Ion Handle, QP2630/21 eos Shea Better Travel Size Shaving Cream, Pomegranate Raspberry, 24HR Hydration, 74ml OOCOME Women 4 In 1 Rechargeable Electric Epilator Hair Shaver Lady'S Electric Trimmer Remover Waterproof Razor For Bikini Area Nose Armpit Arm Leg Eyebrow Razors,Multifunctional Stainless Steel Eyebrow Trimmer for Men and Women, Grooming Shavers/Face Hair Removers/Removal/Shaving Tools Set Charmonic 17.5 Oz Hair Wax Beans , Hard Body Wax Beans, Hair Removal Depilatory Wax European Beads for Women Men 500g/1.1 lb (Chamomile) Exploring Handmade Jewellery: Number of lines: 28666 Sample data: Earrings for Women Spiral threader earrings 14K gold earrings hand bent dangle earrings for women,suitable for gift giving, perfect for your birthday party, Christmas, gift giving. Tiny Nose Ring Hoop 20 G Nose Piercings Hoop - 14K Gold Filled Nose Piercings hoop Smilebelle Evil Eye Necklace Gold 14K Protection Necklace with Zircons as Thanksgiving Gifts, Eye Necklace for Women Handmade Jewelry, Luck Amulet for Protection, Third Eye Necklace Birthday Gift for Her Fake Clip On Nose Ring 24g - 925 Sterling Silver - No Piercing Needed - Fake Nose Hoop Spiral Threader Earrings 925 Sterling Silver Twisted Linear Curved Pull Through Earrings Exploring Kids & Babies: Number of lines: 101988 Sample data: Girls 9 Pack Tagless Hipster Kids Watch, Girls Digital Watch with Alarm/Stopwatch/Distance/Calories/Steps Counter, Watches for Kids Teens Gift for Girls Boys Girls 9 Pack Tagless Brief Girls 6 Pack - Toddler Assortment Lucky Clover Necklace For Women Girls, 18K Gold Plated Cute Fashion Simple Girls Titanium Steel Hypoallergenic Pendant Exploring Luggage & Travel Gear: Number of lines: 21444 Sample data: Secure Travel Money Belt, Undercover Hidden RFID Blocking Travel Wallet, Anti-Theft Passport Wallets for Men Women Cipway - 5 Set Compression Packing Cubes for Travel, Ultralight Packing Organizers for Luggage Suitcase & Backpack (White), L Luggage Sets 3 Piece Softside Expandable Lightweight & Durable Suitcase Sets Double Spinner Wheels TSA Lock (20in/24in/28in) Blue Slim Minimalist Aluminum Wallet for Men/Credit Card Holder for Men with Cash Strap Windproof Travel Umbrella - Wind Resistant, Small - Compact, Light, Automatic, Strong, Mini, Folding and Portable - Backpack, Car, Purse Umbrellas for Rain - Men and Women Exploring Home Decor: Number of lines: 67771 Sample data: Fall Candles, Pumpkin Spice Candles for Home, Autumn Candle, Pumpkin Candle, Fall Scented Candles for Home, Fall Home Decor, Fall Bathroom Decor, Autumn Decor, Hello Pumpkin, Hello Fall - 9oz Kim and Pom Pumpkin Spice Candle, Fall Scent, Fall Candles Get Well Soon Gifts for Women Sympathy Gift Baskets Care Package Self Care gifts for Sick Friends Mom Grandma Wife After Surgery Feel Better Gifts Thinking of You Encouragement Stress Relief Present Birthday Gifts for Women Sunflower Gifts Sunshine Gifts Baskets for Women Gifts for Friends Female Self Care Package Thinking of You Gift Box for Her Sister Boss Lady Inspirational Get Well Soon Gifts Handmade in Canada - Wooden Custom Baby Name Signs for Nursery for Boys & Girls - Choose Size, Font, Color - Baby Room Wall Decor - Newborn Essentials Gift - Wood Letters - Personalized Nursery Decor Exploring Pets: Number of lines: 18724 Sample data: "peepeego Upgrade Non-Slip Dog Pads Extra Large 72"" x 72"", Washable Puppy Pads with Fast Absorbent, Reusable, Waterproof for Training, Travel, Whelping, Housebreaking, Incontinence, for Playpen, Crate" Sure Petcare - SureFeed Microchip Pet Feeder - The Automatic Pet Feeder That Makes Meal Times Stress Free - Helps prevent food stealing - Great for Prescription and Weight Management Diets Pet N Pet Dog Poop Bags 1080 Counts, Green Dog Bags Poop Bag, USDA Certified 38% Biobased Doggy Poop Bags Dog Bag, Durable Dog Waste Bags Dog Poop Bag, Dog Poo Bags, Pet Poop Bags Dogs Poops Bag Purina Pro Plan Veterinary Supplements Dog Supplement, FortiFlora Powdered Canine Probiotic - 30 x 1 g Sachets (1 Pack), Brown Arm & Hammer Clump & Seal Slide Clay Cat Litter, 12.7kg, Odour Control, Dust Free, Clumping Litter Exploring Handmade Kitchen & Dining: Number of lines: 18674 Sample data: Thank You Gifts for Women Spa Thoughtful Unique Gift Basket for Coworkers Nurse Friends Men Boss Employee Secretary Hostess Teacher Mom Her Personalized Whiskey Glasses - Custom Whiskey Gifts for Men - Old Fashion Rocks Scotch Glass - Birthday, Anniversary, Dad, Boyfriend, Husband, Mens Gifts - Christmas Gift for Men - Mens Xmas Gift Off Cut & Co. Premium Board Balm Wax - Premium Canadian Beeswax and Mineral Oil Cutting Board Balm Wax- (3.5 oz/ 100g) Handmade in Canada - Personalized Cutting Board - Unique Wedding Gift Idea for Couples, Anniversary, Bridal Shower, Housewarming - Christmas Gift for Couples - Custom Charcuterie & Cheese Board Personalized Cutting Boards - Wedding Gifts, House Warming Gifts, Anniversary Gifts for Her & Him, Couples Engagement Gifts - Cheese & Charcuterie Board - Personalized Gifts for Men, Women & Couples Exploring Outdoor Cooking: Number of lines: 16976 Sample data: ThermoPro Waterproof Digital Instant Read Meat Thermometer Kitchen Cooking Food Thermometer with Backlight Steak Oil Fry Candy Thermometer "Grillman Heavy-Duty BBQ Cover, Gas Grill Cover for Weber Spirit, Weber Genesis, Char Broil, Nexgrill. Rip-Proof, Waterproof (58"" L x 24"" W x 48"" H, Black) BBQ Covers" SimpleHouseware 55-inch Waterproof Heavy Duty Gas BBQ Grill Cover, Weather-Resistant Polyester Traeger Grills Signature Blend 100% All-Natural Wood Pellets for Smokers and Pellet Grills. BBQ, Bake, Roast, and Grill, 20 lb. Bag Grill Cover, BBQ Cover 58 inch,Waterproof BBQ Grill Cover,UV Resistant Gas Grill Cover,Durable and Convenient,Rip Resistant,Black Barbecue Grill Covers,Fits Grills of Weber,Brinkmann,Char-Broil etc (58 Inch) Exploring Men: Number of lines: 23235 Sample data: Polarized Aviator Sunglasses for Men Women Metal Flat Top Sunglasses lightweight Driving UV400 Outdoor 58mm Semi-Rimless Polarized Sunglasses UV Protection Classic Half Frame Sun Glasses Men Women Mens Hooded Sweatshirt Men's Regular-Fit Long-Sleeve Solid Shirt, Black, Large Mens Classic Relaxed Fit Stretch Cargo Short Exploring Women: Number of lines: 25349 Sample data: Winter Gloves Womens 100% Genuine Leather Touchscreen Warm Driving Gloves Premium Orthopedic Open Toe Sandlas Anti-Slip Ladies Wedge Faux Leather Sandals Summer Hook and Loop Comfy Sandals Casual Beach Sandals,Brown,US6/EU37 womens Marl Slub Slouch Boot Sock, 4 Pair Pack Catholic Pink Crystal Beads Gold Rosary Flowers Beaded Necklace Holy Mary Heart Locket Medal & Cross Religious Amulet for Women, Crystal, No Gemstone 8 Pairs Clip on Earrings for Women Dangling Cross Butterfly Star Pearl Clip Dangle Earrings Set Hypoallergenic Clip Long Earrings Non Pierced Piercing Jewelry Silver Gold Tone Exploring Grocery: Number of lines: 22912 Sample data: 1LB. 100% Hawaii Hawaiian Kona Extra Fancy Coffee Beans HERSHEY'S Unsweetened Cocoa Powder for Baking, Chocolate Powder, Gluten Free, 652g - Online Exclusive 1LB. 100% Jamaican Blue Mountain Roasted Coffee Maynards, Assorted Gummy Candy (Pack of 90), Sour Patch Kids, Fuzzy Peach, Swedish Berries, Swedish Fish, Bulk Candy, Individually Wrapped, Sour Candy, Halloween Candy, 1.12 kg MARS Variety, Halloween Chocolate Candy Bars, Assorted Fun Size Bars, Bulk Box, 120 Count Exploring Work & Safety: Number of lines: 16684 Sample data: Mens Ripstop Men's Multi-Cargo Scrub Pant Tuffo unisex baby overalls and coveralls workwear apparel, Blue, 4T Pack of 1 US White Lab Coats Doctor Workwear - Unisex Lab Coat Scrubs for Woman and Man Venom Steel Heavy Duty Breathable Coverall L/XL, White Adjustable Working Cap with Button, Cotton Working Hat Sweatband, Elastic Bandage Tie Back Hats for Women & Men, One Size Exploring Hobbies & Crafts: Number of lines: 16802 Sample data: Keadic 9Pcs Gundam Model Tools Kit Hobby Building Tools Craft Set for Basic Model Building, Repairing and Fixing LIFEGOO Gundam Model Tools Kit, 42 in 1 Modeler Basic Tools Craft Set Hobby Building Tools Kit with Tool Case Perfect for Gundam Model Building Repairing and Fixing Magnifier with light and stand with 18 LEDs, 10 frames 30 frames foldable reading magnifier - 3 lighting modes, dimmable large glass hand crank magnifier Desktop magnifier for the elderly, children, reading, inspection, hobby - with lens cloth Bandai Hobby - Mobile Suit Gundam - HG 1/144 Gundam Barbatos Lupus Model Kit XINMEIWEN Pieces Shelves Tool Stand Holder Model Production Tools Placement Rack Plastic Holder Container Organizer for Gundam Hobby Model Making Parts (29x18x12cm) Exploring Toys & Games: Number of lines: 18990 Sample data: iHaHa Fire Truck Toys for 1 2 3 4 5 6 Years Old Boys Toddler, 5 in 1 Kids Carrier Toy Birthday, Car Friction Power Toys with Light Sound Magnetic Tiles Kids Toys STEM Magnet Toys for Toddler Magnetic Blocks Building Toys Preschool Learning Sensory Montessori Toys for 3+ Year Old Boys and Girls, Safe Creativity Toddler Kids Toys yiyisibao Magnet Toys for 3 Year Old Boys & Girls, Magnetic Blocks STEM Learning Educational Building Blocks for Kids Ages 4-8, Toddler Toys 40 PCS Montessori Toys for 2 Year Old Boys Drum Set for Kids with 2 Drum Sticks and Microphone, Musical Toys Gift for Toddlers… Kiddiworld Dinosaur Toys for 3 4 5 Year Old Boys Gifts, Dinosaurs Toys for Kids 3-5-7, Dino Figures Activity Play Mat Christmas Birthday Gifts for Girls Toddler Toys Age 2-4
In [ ]:
# Calculate total number of lines
total_lines = (
fashion_accessories_text.count() +
data_storage_text.count() +
perfume_cologne_text.count() +
automotive_tools_text.count() +
beauty_personal_care_text.count() +
bath_body_text.count() +
shaving_hair_removal_products_text.count() +
handmade_jewellery_text.count() +
kids_babies_text.count() +
luggage_travel_gear_text.count() +
home_decor_text.count() +
pets_text.count() +
handmade_kitchen_dining_text.count() +
outdoor_cooking_text.count() +
men_text.count() +
women_text.count() +
grocery_text.count() +
work_safety_text.count() +
hobbies_crafts_text.count() +
toys_games_text.count()
)
print("Total number of lines across all datasets:", total_lines)
Total number of lines across all datasets: 1019853
In [ ]:
# Define ignored words
ignored_words = ['men', 'women', "women's", "men's", 'mens', 'womens', 'for', 'and', 'or', 'x', 'X', 'the', 'in', 'of', 'with', 'on', 'at', 'by', 'to', 'from', 'as', 'is', 'are', 'it', 'be', 'that', 'which', 'this', 'where', 'when', 'how', 'so', 'also', 'will', 'has', 'have', 'but', 'not', 'no', 'may', 'yet', 'you', 'we', 'i', 'a', 'an', '(', ')', '[', ']', '{', '}', ',', '.', ';', ':', '-', '_', '/', '\\', '!', '?', '"', "'", '*', '&', '%', '$', '@', '+', '=', '<', '>', '|']
# Function to perform word count while filtering out ignored words
def word_count(line):
words = line.lower().split()
# Remove ignored words
filtered_words = [word for word in words if word not in ignored_words]
return [(word, 1) for word in filtered_words]
In [ ]:
# Word count for each text file
word_counts = {
'fashion_accessories': fashion_accessories_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
'data_storage': data_storage_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
'perfume_cologne': perfume_cologne_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
'automotive_tools': automotive_tools_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
'beauty_personal_care': beauty_personal_care_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
'bath_body': bath_body_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
'shaving_hair_removal_products': shaving_hair_removal_products_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
'handmade_jewellery': handmade_jewellery_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
'kids_babies': kids_babies_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
'luggage_travel_gear': luggage_travel_gear_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
'home_decor': home_decor_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
'pets': pets_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
'handmade_kitchen_dining': handmade_kitchen_dining_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
'outdoor_cooking': outdoor_cooking_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
'men': men_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
'women': women_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
'grocery': grocery_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
'work_safety': work_safety_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
'hobbies_crafts': hobbies_crafts_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
'toys_games': toys_games_text.flatMap(word_count).reduceByKey(lambda x, y: x + y)
}
In [ ]:
# Display top 100 words for each dataset
for category, word_count_rdd in word_counts.items():
print(f"\nTop 100 words for {category}:")
top_100_words = word_count_rdd.takeOrdered(100, key=lambda x: -x[1])
for word, count in top_100_words:
print(f"{word}: {count}")
Top 100 words for fashion_accessories: watch: 58279 earrings: 33891 girls: 31089 leather: 30158 bag: 27880 jewelry: 27154 necklace: 26727 silver: 24525 black: 21789 gold: 20943 bracelet: 20411 strap: 19887 steel: 19671 gift: 17853 dress: 17513 stainless: 17426 casual: 17176 shoes: 17018 gifts: 16307 fashion: 16274 sleeve: 16148 band: 15821 set: 15744 shoulder: 15198 long: 15115 sterling: 13260 chain: 13072 pendant: 12905 purse: 12855 size: 12840 hat: 12591 quartz: 12588 wedding: 12498 crossbody: 11988 ring: 11735 party: 11120 adjustable: 10910 boots: 10166 waterproof: 10068 sunglasses: 9348 vintage: 9277 watches: 9269 short: 9150 2: 8958 toe: 8758 cotton: 8410 heart: 8264 crystal: 8260 high: 8185 white: 8135 pocket: 8117 shirt: 8060 handbag: 8005 summer: 7971 genuine: 7954 classic: 7895 small: 7886 birthday: 7886 boot: 7769 top: 7714 tote: 7657 women,: 7490 cute: 7470 lightweight: 7376 ladies: 7345 belt: 7309 soft: 7266 neck: 7202 winter: 7173 dangle: 7152 925: 7118 rings: 7109 bracelets: 7102 fit: 7043 watch,: 7031 wallet: 7021 one: 7010 cap: 6989 plated: 6881 hoop: 6851 bags: 6684 stud: 6662 pants: 6654 buckle: 6623 kids: 6619 socks: 6436 wrist: 6407 round: 6358 3: 6335 pack: 6304 shoe: 6285 sports: 6186 wide: 6133 large: 6096 beach: 5932 unisex: 5925 big: 5654 accessories: 5590 slip: 5576 blue: 5565 Top 100 words for data_storage: drive: 19342 usb: 15520 flash: 8621 hard: 6645 memory: 5909 stick: 5409 ssd: 5064 external: 4231 internal: 3328 thumb: 3322 storage: 3320 sata: 3215 3.0: 3048 portable: 3006 solid: 2817 state: 2806 m.2: 2636 drive,: 2556 2.0: 2531 up: 2241 nvme: 2240 hdd: 2193 drives: 2143 card: 1989 disk: 1961 data: 1930 compatible: 1916 speed: 1825 pen: 1729 64gb: 1726 pcie: 1722 pack: 1719 1tb: 1694 pc: 1689 32gb: 1662 black: 1644 gb: 1617 high: 1508 128gb: 1502 desktop: 1433 1: 1419 laptop: 1369 2: 1334 c: 1326 pro: 1309 2tb: 1278 10: 1267 inch: 1229 ultra: 1123 2.5: 1082 seagate: 1000 jump: 982 16gb: 971 gen: 966 iii: 962 metal: 950 3d: 946 adapter: 902 3: 900 3.2: 897 sd: 894 256gb: 892 4: 891 –: 859 swivel: 845 2280: 844 photo: 826 nand: 823 sandisk: 822 computer: 811 bulk: 809 2.5"": 808 digital: 801 pc,: 786 3.1: 765 cache: 759 6gb/s: 745 type: 743 wd: 740 read: 739 backup: 735 tb: 722 dual: 708 512gb: 703 5: 700 class: 700 blue: 694 android: 689 mac: 681 500gb: 675 samsung: 668 pendrive: 664 4tb: 658 nas: 657 design: 644 performance: 640 ssd,: 623 mb/s: 603 xbox: 603 mini: 602 Top 100 words for perfume_cologne: perfume: 10522 de: 6145 spray: 5823 eau: 5742 oil: 4847 essential: 3671 fragrance: 3626 lasting: 3288 parfum: 3156 oz: 3133 long: 3046 bottle: 2540 perfume,: 2480 toilette: 2088 solid: 2085 spray,: 1967 pheromone: 1913 3.4: 1869 ml: 1841 cologne: 1740 travel: 1637 portable: 1440 bag: 1371 women,: 1278 storage: 1277 body: 1241 edt: 1058 attar: 1052 gift: 1042 men,: 996 ounce: 949 light: 913 scent: 895 case: 853 set: 835 edp: 826 100: 811 balm: 800 ounces: 800 natural: 792 100ml: 777 floral: 776 oils: 684 pack: 665 organizer: 657 50ml: 653 1.7: 643 unisex: 642 pcs: 624 refreshing: 622 mini: 597 free: 581 1: 565 daily: 561 attract: 544 al: 542 bottles: 538 mist: 529 3: 519 6ml: 517 box: 512 pure: 497 alcohol: 475 perfumes: 460 fruity: 460 aalam: 456 pocket: 449 flower: 438 musk: 431 holder: 427 glass: 426 2: 424 fl: 414 aromatherapy: 400 4: 397 black: 397 fresh: 397 fragrances: 392 3.3: 391 2pcs: 387 rose: 384 dating: 383 elegant: 379 30ml: 368 carrying: 361 suitable: 361 empty: 359 premium: 358 fomiyes: 357 pheromones: 355 nimal: 349 2.5: 346 100%: 339 uses: 335 shave: 333 fragrance,: 333 luxury: 330 refillable: 326 white: 325 oil,: 319 Top 100 words for automotive_tools: door: 12232 steel: 11203 air: 8216 bit: 7982 drill: 7775 inch: 7585 replacement: 7529 pack: 6778 2: 6593 set: 6458 black: 6315 bearing: 6123 ball: 5664 filter: 5319 kit: 5006 heavy: 4744 tool: 4710 led: 4695 stainless: 4687 duty: 4640 rubber: 4639 4: 4632 compatible: 4496 shank: 4486 1: 4319 router: 4239 wall: 4067 car: 3896 bearings: 3895 wood: 3830 pcs: 3794 double: 3691 metal: 3668 light: 3642 3: 3447 white: 3429 bits: 3408 belt,: 3401 carbide: 3400 tape: 3325 adhesive: 3284 hole: 3268 high: 3258 cabinet: 3225 lock: 3217 powerdrive: 3180 round: 2909 furniture: 2815 uxcell: 2768 groove: 2751 10: 2706 cover: 2683 cutting: 2653 6: 2621 floor: 2531 belt: 2450 kitchen: 2404 v: 2359 home: 2357 waterproof: 2336 deep: 2303 aluminum: 2276 plastic: 2258 hooks: 2227 screw: 2220 cutter: 2218 bathroom: 2200 carbon: 2179 length: 2164 5: 2134 2pcs: 2133 diameter: 2101 cnc: 2094 thread: 2089 self: 2085 wire: 2082 hand: 2081 end: 2074 seal: 2065 front: 2040 holder: 2027 garage: 2024 adjustable: 1997 repair: 1981 chrome: 1958 saw: 1954 tile: 1947 roller: 1929 bore: 1924 window: 1900 set,: 1887 1/2"": 1883 vent: 1869 machine: 1866 switch: 1845 linear: 1839 towel: 1823 speed: 1795 tools: 1780 diy: 1778 Top 100 words for beauty_personal_care: hair: 52892 face: 18706 skin: 16922 makeup: 16242 ml: 12965 body: 12732 lip: 12525 cream: 11881 eye: 11060 oil: 10959 natural: 10297 de: 9790 oz: 9427 eau: 8749 tattoo: 8733 spray: 8672 black: 8373 brush: 8009 set: 7811 long: 7289 pack: 6537 2: 6507 perfume: 6445 gel: 6398 1: 6134 girls: 5987 waterproof: 5913 facial: 5854 dry: 5781 care: 5716 professional: 5596 eyebrow: 5573 lash: 5547 eyelash: 5537 nail: 5524 pcs: 5505 kit: 5451 powder: 5134 shampoo: 4990 liquid: 4901 color: 4814 lasting: 4778 serum: 4738 clips: 4737 free: 4710 3: 4692 mask: 4604 all: 4530 accessories: 4409 parfum: 4404 dark: 4243 100ml: 4127 matte: 4125 beauty: 4090 gift: 4086 soft: 4066 moisturizing: 4057 party: 4020 wig: 3949 temporary: 3948 vegan: 3942 4: 3931 lotion: 3903 travel: 3867 lashes: 3859 halloween: 3850 fragrance: 3811 tattoos: 3801 styling: 3739 comb: 3706 fl: 3699 vitamin: 3573 lipstick: 3514 6: 3469 eyeshadow: 3438 hydrating: 3431 balm: 3409 –: 3385 toilette: 3377 100: 3295 glitter: 3288 hair,: 3286 (pack: 3278 remover: 3267 cream,: 3256 50ml: 3213 tool: 3199 eyeliner: 3193 foundation: 3187 skin,: 3166 up: 3162 mascara: 3161 conditioner: 3059 fake: 3053 stick: 3042 50: 3042 kids: 3032 organic: 2988 extension: 2947 extensions: 2947 Top 100 words for bath_body: bath: 8119 shower: 4369 body: 4310 soap: 3695 baby: 3679 exfoliating: 1989 hair: 1947 wash: 1728 skin: 1672 natural: 1655 brush: 1583 scrubber: 1544 toys: 1523 towel: 1493 sponge: 1464 oz: 1421 cap: 1375 soft: 1371 pack: 1360 deodorant: 1290 kids: 1194 toy: 1192 back: 1185 hand: 1169 bathtub: 1132 2: 1049 wipes: 1032 spa: 987 bar: 985 loofah: 860 towels: 847 3: 840 pcs: 835 face: 830 long: 827 sweat: 759 rubber: 745 set: 726 4: 724 soap,: 720 (pack: 711 gift: 688 bonnet: 687 duck: 676 pillow: 668 oil: 659 mesh: 642 6: 633 dry: 629 cotton: 618 bathroom: 587 bathing: 578 silicone: 573 ounce: 565 water: 559 1: 553 suction: 547 wipe: 545 shampoo: 539 tub: 539 organic: 538 satin: 534 caps: 516 scrub: 513 home: 510 wet: 510 girls: 509 black: 507 100%: 507 ml: 506 hat: 499 bag: 499 large: 496 cleaning: 495 gloves: 492 gel: 489 massage: 485 silk: 483 travel: 482 sleep: 474 free: 461 pure: 457 care: 457 absorbent: 457 net: 451 white: 450 reusable: 447 toddler: 445 all: 435 hooded: 432 waterproof: 426 gifts: 413 kitchen: 411 infant: 403 neck: 403 underarm: 401 boys: 401 cleansing: 399 pads: 394 handle: 387 Top 100 words for shaving_hair_removal_products: hair: 10783 beard: 5766 razor: 5582 shaver: 5244 electric: 4867 removal: 4099 wax: 4066 shaving: 3679 trimmer: 3342 tweezers: 3018 nose: 2181 eyebrow: 2131 replacement: 2036 brush: 2007 portable: 1942 facial: 1884 tool: 1800 head: 1789 steel: 1687 stainless: 1641 remover: 1513 travel: 1507 kit: 1506 face: 1495 body: 1474 rechargeable: 1396 mini: 1364 set: 1289 1: 1276 blades: 1230 shave: 1225 professional: 1089 blade: 1081 grooming: 1066 men,: 1027 usb: 1002 series: 985 painless: 972 home: 957 waxing: 948 razors: 943 safety: 941 epilator: 931 eyelash: 925 case: 909 double: 901 bowl: 895 3: 894 cream: 888 2: 825 shavers: 822 tweezer: 807 philips: 805 edge: 771 foil: 765 skin: 763 waterproof: 756 wet: 755 soap: 725 pcs: 714 depilatory: 707 shaver,: 704 tools: 703 trimmer,: 689 use: 684 beauty: 678 comb: 671 mustache: 670 norelco: 658 braun: 657 dry: 652 precision: 645 holder: 641 black: 633 handle: 631 5: 623 crystal: 622 bikini: 620 compatible: 619 storage: 606 stand: 585 machine: 585 pocket: 578 heads: 569 fomiyes: 565 cleaning: 564 care: 563 strips: 561 power: 559 4: 558 cordless: 558 warmer: 547 pack: 545 scissors: 531 easy: 526 oil: 517 razor,: 508 women,: 507 legs: 504 barber: 502 Top 100 words for handmade_jewellery: ring,: 16262 silver: 13328 sterling: 9417 gift: 9093 jewelry: 8818 ring: 8783 earrings: 8082 handmade: 7405 gold: 6533 necklace: 6208 925: 6156 gemstone: 4949 pendant: 4038 bracelet: 3593 rings: 3420 birthstone: 3119 blue: 2905 stone: 2862 natural: 2789 necklace,: 2756 plated: 2610 statement: 2420 pendant,: 2340 jewelry,: 2310 gifts: 2251 black: 2222 earrings,: 2153 her: 2117 solid: 2014 wedding: 1920 crystal: 1898 charm: 1809 box: 1806 14k: 1748 fashion: 1702 bracelet,: 1627 green: 1598 jewellery: 1588 rose: 1568 turquoise: 1554 band: 1547 dainty: 1486 day: 1447 white: 1405 onyx: 1402 •: 1373 pin: 1366 birthday: 1351 women,: 1350 unique: 1343 stud: 1333 boho: 1318 custom: 1271 oval: 1271 girls: 1254 anemone: 1229 quartz: 1209 set: 1198 designer: 1179 dangle: 1178 chain: 1144 healing: 1141 veracity: 1140 heart: 1135 pink: 1131 round: 1122 drop: 1077 amethyst: 1064 engagement: 1019 18k: 1011 diamond: 989 christmas: 987 gift,: 984 moonstone: 982 size: 978 red: 945 adjustable: 934 anniversary: 912 glass: 912 promise: 907 shape: 905 vintage: 881 yellow: 866 flower: 865 wish: 848 hoop: 838 silver,: 836 personalized: 780 nose: 762 mothers: 759 rings,: 758 bar: 755 pearl: 751 genuine: 750 name: 743 bead: 741 cute: 737 style: 727 her,: 723 beads: 721 Top 100 words for kids_babies: baby: 43808 girls: 34185 kids: 27612 boys: 25838 toddler: 17796 toys: 15452 set: 10789 sleeve: 9901 toy: 9867 girl: 9511 long: 8903 infant: 7708 cotton: 7647 gift: 7295 shoes: 6895 winter: 6706 years: 6594 boy: 6592 2: 6358 newborn: 6318 dress: 6295 3: 6120 clothes: 6116 gifts: 6030 soft: 5963 birthday: 5733 months: 5684 socks: 5361 pants: 5331 pack: 4983 toddlers: 4583 unisex: 4519 warm: 4430 hat: 4354 bath: 4338 6: 4321 necklace: 4284 cute: 4274 children: 4051 party: 4019 1: 3956 short: 3948 sensory: 3942 shirt: 3894 old: 3882 year: 3748 outfits: 3658 fleece: 3617 t-shirt: 3599 summer: 3595 outfit: 3593 romper: 3541 learning: 3508 jacket: 3487 top: 3482 christmas: 3444 little: 3383 wooden: 3336 4: 3325 hooded: 3267 up: 3201 black: 3197 tops: 3191 jewelry: 3188 montessori: 3169 school: 3117 educational: 3062 shorts: 2949 size: 2916 casual: 2775 12: 2765 sweatshirt: 2725 toys,: 2703 5: 2700 bodysuit: 2690 hoodie: 2655 babies: 2639 play: 2617 princess: 2588 cartoon: 2573 print: 2551 white: 2547 halloween: 2519 piece: 2479 coat: 2478 adjustable: 2477 silicone: 2386 suit: 2367 waterproof: 2339 activity: 2329 one: 2329 bracelet: 2315 age: 2307 rubber: 2279 boots: 2257 color: 2224 car: 2199 silver: 2187 solid: 2145 animal: 2116 Top 100 words for luggage_travel_gear: bag: 13110 travel: 6056 backpack: 4185 wallet: 3529 leather: 3401 bags: 3248 shoulder: 3244 luggage: 3035 purse: 3021 tote: 2995 crossbody: 2588 keychain: 2565 gifts: 2313 card: 2238 gift: 2237 large: 1800 holder: 1785 key: 1721 small: 1631 laptop: 1583 handbag: 1544 bag,: 1537 pack: 1505 rfid: 1366 black: 1348 waterproof: 1335 girls: 1323 suitcase: 1201 inch: 1163 school: 1100 canvas: 1090 shopping: 1087 strap: 1083 power: 1048 mini: 1045 lightweight: 1041 pocket: 1000 case: 969 cute: 968 organizer: 921 set: 920 2: 918 kids: 908 zipper: 901 women,: 899 blocking: 874 credit: 868 adjustable: 859 clutch: 836 carry: 828 gym: 820 adapter: 818 slim: 813 birthday: 806 hiking: 798 duffel: 797 pouch: 793 chain: 774 handbags: 760 coin: 742 one: 738 umbrella: 720 packing: 714 sling: 704 reusable: 691 daypack: 671 storage: 669 casual: 668 pu: 655 beach: 653 business: 652 messenger: 646 size: 644 purses: 644 dc: 620 id: 617 car: 603 black,: 598 grocery: 589 genuine: 586 converter: 581 belt: 579 sports: 575 pcs: 569 3: 565 tags: 564 christmas: 563 bifold: 556 work: 553 portable: 552 keyring: 550 fashion: 545 clear: 542 cover: 537 phone: 537 wallets: 535 duffle: 534 usb: 533 ring: 530 supply: 520 Top 100 words for home_decor: wall: 19317 print: 18490 art: 18272 light: 16444 led: 15467 bulb: 10271 decor: 10095 gift: 9929 white: 9264 home: 8497 poster: 8131 bulbs: 6980 pack: 6335 handmade: 5697 sign: 5514 warm: 5206 bulb,: 4571 unframed: 4468 painting: 4426 christmas: 4278 bulbs,: 4018 name: 3938 dimmable: 3753 skyline: 3742 house: 3618 personalized: 3583 custom: 3518 halogen: 3368 candle: 3348 inspired: 3258 personalised: 3160 black: 3138 base: 3122 canvas: 3113 poster,: 2990 2: 2973 lamp: 2964 modern: 2921 metal: 2847 clock: 2753 room: 2741 watt: 2691 vintage: 2649 edison: 2640 dog: 2593 equivalent: 2555 gifts: 2553 art,: 2499 print,: 2355 decoration: 2350 glass: 2322 (unframed): 2312 energy: 2283 original: 2219 watercolor: 2206 wood: 2202 address: 2176 gift,: 2144 plaque: 2136 replacement: 2125 acrylic: 2081 screw: 2052 decor,: 2046 3: 2043 wax: 2020 e26: 2016 6: 2008 box: 1994 4: 1986 lighting: 1980 wooden: 1919 clear: 1917 frame: 1908 wedding: 1840 set: 1831 number: 1827 tree: 1772 2700k: 1766 nursery: 1741 white,: 1730 base,: 1728 fine: 1708 saving: 1707 quote: 1702 door: 1686 design: 1676 family: 1670 filament: 1666 scented: 1655 10: 1631 small: 1629 e27: 1626 baby: 1605 framed: 1579 daylight: 1577 fluorescent: 1570 equivalent,: 1558 cool: 1553 inch: 1501 soy: 1452 Top 100 words for pets: dog: 13648 cat: 8139 pet: 5736 dogs: 4029 small: 2789 food: 2415 large: 2373 cats: 2192 toys: 1883 puppy: 1706 natural: 1669 medium: 1664 toy: 1615 water: 1604 pack: 1527 collar: 1306 dogs,: 1302 fish: 1287 training: 1239 2: 1236 chew: 1124 dry: 937 chicken: 934 indoor: 932 adjustable: 926 litter: 914 pets: 914 grooming: 913 aquarium: 910 bed: 896 soft: 892 interactive: 885 treats: 883 3: 836 1: 804 kitten: 797 cats,: 770 bird: 752 mat: 732 bag: 714 –: 702 free: 690 4: 670 waterproof: 669 food,: 669 treat: 656 hair: 651 ball: 637 bowl: 633 adult: 628 seat: 620 tank: 611 brush: 609 all: 606 kg: 597 fountain: 595 feeder: 583 car: 575 long: 542 easy: 538 cleaning: 538 durable: 536 extra: 532 flea: 522 toys,: 518 light: 516 wet: 516 up: 501 filter: 498 100%: 493 scratching: 492 g: 488 washable: 479 premium: 476 black: 469 safety: 467 6: 464 plush: 457 (pack: 456 steel: 455 stainless: 453 dental: 451 toy,: 445 bags: 445 skin: 443 spray: 440 outdoor: 433 box: 430 animal: 426 leash: 416 chews: 411 feeding: 408 set: 407 remover: 399 pad: 397 paw: 395 high: 391 complete: 389 pcs: 388 cover: 386 Top 100 words for handmade_kitchen_dining: gift: 11987 mug: 11274 coffee: 10455 15oz: 8296 11oz: 8159 black: 5195 lovers: 4789 tea: 4702 mug,: 4367 cup: 4127 dog: 3781 color: 3639 gifts: 2954 white: 2692 cup,: 2322 funny: 2321 birthday: 2178 pet: 2040 cat: 1923 oz: 1765 idea: 1702 cake: 1501 blackmug,: 1392 changing: 1370 design: 1328 ceramic: 1317 day: 1317 inner: 1294 accent: 1286 topper,: 1234 kids: 1056 personalized: 1027 12oz: 957 love: 946 custom: 853 my: 822 tasse: 820 cute: 810 quote: 806 ideas: 769 illustration: 767 lover: 765 vintage: 744 unique: 740 11: 738 mom: 727 dad: 726 15: 726 i'm: 699 christmas: 677 fitness: 665 gift,: 657 perfect: 651 scottish: 651 day,: 651 clan: 645 crest: 632 silver: 622 gym: 616 wine: 603 fans: 601 enamel: 584 animal: 567 elegance: 563 atelier: 562 teacher: 555 present: 526 badge: 526 tumbler: 524 school: 512 steel: 510 humor: 500 halloween: 496 anniversary: 495 cutting: 479 old: 473 engraved: 459 mugs: 458 customized: 454 board,: 450 name,: 449 travel: 434 bulldog: 427 topper: 426 proud: 421 wooden: 409 best: 396 name: 389 mugs,: 383 who: 380 14oz: 376 steinless: 375 pride: 372 happy: 372 lovers,: 369 unicorn: 368 gifts,: 365 firefighter: 362 dad,: 360 owners: 357 Top 100 words for outdoor_cooking: grill: 9378 bbq: 7186 steel: 3646 gas: 3502 stainless: 3455 outdoor: 3234 barbecue: 3133 cover: 2272 camping: 2126 replacement: 1907 charcoal: 1853 cooking: 1851 weber: 1801 picnic: 1728 propane: 1715 portable: 1494 heat: 1462 grilling: 1403 set: 1392 smoker: 1390 cooler: 1360 oven: 1347 bag: 1323 burner: 1302 accessories: 1265 waterproof: 1248 2: 1219 wood: 1189 kit: 1103 inch: 1081 griddle: 1032 black: 1004 heavy: 1001 grill,: 999 duty: 993 skewers: 979 grills: 976 fire: 960 pizza: 959 basket: 948 brush: 909 large: 908 pack: 900 meat: 818 handle: 814 kitchen: 804 parts: 795 cover,: 736 4: 734 hose: 727 resistant: 721 3: 717 mat: 677 cleaning: 639 tool: 622 series: 619 pit: 617 insulated: 604 iron: 594 storage: 582 tank: 560 1: 557 rack: 555 genesis: 546 cast: 543 regulator: 543 plate: 523 premium: 519 smoking: 513 silicone: 512 garden: 508 pellet: 507 metal: 494 long: 489 food: 488 travel: 480 electric: 474 spirit: 469 beach: 467 patio: 464 adapter: 462 natural: 456 adjustable: 452 apron: 449 chicken: 448 fits: 448 reusable: 447 chef: 444 grate: 440 ii: 440 top: 435 blackstone: 434 quick: 432 blanket: 430 box: 423 tools: 421 grills,: 420 flat: 417 baking: 415 thermometer: 412 Top 100 words for men: cotton: 2388 casual: 2348 sleeve: 2321 shirt: 2280 socks: 2140 long: 1844 fit: 1779 leather: 1707 shorts: 1595 black: 1401 pants: 1377 short: 1356 hat: 1352 shirts: 1349 jacket: 1293 t-shirt: 1209 watch: 1177 unisex: 1172 cap: 1148 classic: 1147 belt: 1136 pack: 1120 winter: 1106 fleece: 1087 sports: 1015 lightweight: 1007 slim: 987 work: 975 size: 957 soft: 940 shoes: 937 breathable: 866 top: 861 pocket: 858 set: 855 outdoor: 830 warm: 824 underwear: 824 running: 821 pockets: 810 sunglasses: 805 steel: 777 hoodie: 771 adjustable: 755 button: 749 3: 748 boxer: 739 2: 739 neck: 734 zip: 708 dress: 683 waterproof: 682 stretch: 681 summer: 669 gift: 654 trousers: 634 polo: 633 hooded: 632 quick: 621 tie: 612 pairs: 597 strap: 595 vest: 590 sweatshirt: 583 dry: 574 wallet: 565 stainless: 563 up: 557 athletic: 554 elastic: 553 boots: 550 pullover: 548 solid: 547 vintage: 546 card: 545 gym: 538 business: 519 hiking: 510 coat: 510 beach: 508 tops: 502 glasses: 500 men,: 494 briefs: 492 crew: 491 wedding: 490 uk: 488 down: 488 fashion: 483 cargo: 479 safety: 477 protection: 468 6: 458 golf: 453 100%: 450 sport: 448 thermal: 443 wide: 442 t: 439 big: 435 Top 100 words for women: earrings: 4409 long: 3429 ladies: 3367 sleeve: 3089 dress: 2604 silver: 2340 casual: 2335 high: 2220 socks: 2180 tops: 2154 girls: 2153 set: 2023 necklace: 1991 neck: 1968 gold: 1952 bra: 1854 size: 1812 waist: 1656 top: 1642 jewelry: 1641 soft: 1551 winter: 1509 lace: 1508 sterling: 1446 cotton: 1439 party: 1438 summer: 1367 bracelet: 1324 pants: 1318 short: 1302 gifts: 1228 gift: 1180 pairs: 1178 warm: 1169 up: 1161 hoop: 1131 wedding: 1104 sexy: 1096 black: 1094 vintage: 1021 belt: 1019 pack: 1013 adjustable: 1005 plus: 1004 fashion: 1000 2: 977 v: 955 chain: 948 women,: 948 pockets: 938 shirt: 936 control: 920 heart: 907 hat: 895 shorts: 886 hair: 877 piece: 872 fleece: 861 lightweight: 857 dresses: 856 925: 854 leather: 846 costume: 846 shirts: 835 stud: 833 elastic: 830 tummy: 822 leggings: 817 loose: 817 skirt: 814 wide: 808 shoes: 807 stretch: 798 sunglasses: 798 halloween: 797 3: 793 plated: 772 yoga: 758 uk: 747 one: 747 strap: 743 underwear: 743 tights: 741 sleeveless: 736 beach: 716 fit: 699 jacket: 686 seamless: 678 knit: 671 birthday: 667 bracelets: 666 headband: 663 pendant: 634 floral: 632 dangle: 631 slip: 631 waisted: 631 crystal: 619 glasses: 619 open: 618 Top 100 words for grocery: organic: 6324 tea: 4882 coffee: 2001 natural: 1975 pack: 1945 (pack: 1740 free: 1578 chocolate: 1516 1: 1358 powder: 1201 100%: 1196 gluten: 1195 sugar: 1165 bags: 1109 premium: 1060 black: 1023 count: 1015 g: 1011 oz: 1008 whole: 976 vegan: 925 green: 920 –: 897 tea,: 873 free,: 869 12: 805 vegan,: 798 leaf: 779 bag: 761 2: 760 non-gmo,: 757 grams: 752 protein: 745 dried: 726 certified: 726 pure: 720 white: 694 herbal: 683 fruit: 672 food: 658 1kg: 657 candy: 652 ml: 649 cake: 646 original: 644 6: 643 hot: 632 low: 616 dark: 607 coffee,: 594 mix: 592 24: 588 high: 586 made: 586 milk: 571 3: 567 all: 566 4: 555 gift: 537 ounce: 535 salt: 533 box: 533 ground: 523 loose: 516 20: 508 bulk: 505 quality: 496 red: 490 non-gmo: 488 medium: 485 keto: 484 raw: 478 100: 477 1): 472 coconut: 470 yupik: 466 gourmet: 461 oil: 459 sweet: 453 rice: 452 organic,: 444 vanilla: 439 variety: 437 instant: 437 roast: 435 powder,: 434 kosher,: 433 10: 429 gram: 418 blend: 412 5: 408 8: 406 snack: 405 16: 404 kg: 402 butter: 397 drink: 393 perfect: 390 lemon: 385 roast,: 385 Top 100 words for work_safety: sleeve: 3327 hats: 3053 working: 2870 pants: 2655 scrub: 2474 work: 2382 cap: 2332 casual: 2298 adjustable: 2230 jacket: 2206 chef: 2171 short: 2153 long: 2075 2: 1828 sweatband: 1774 shirt: 1672 tops: 1636 size: 1583 top: 1511 hat: 1495 uniform: 1473 unisex: 1457 fit: 1414 back: 1334 coat: 1333 tie: 1300 bouffant: 1281 buttons: 1280 shirts: 1258 summer: 1251 neck: 1175 pockets: 1166 loose: 1149 elastic: 1069 set: 1063 cargo: 1025 pant: 1017 cotton: 1007 v-neck: 991 button: 975 caps: 972 scrubs: 948 pocket: 939 high: 919 workwear: 839 print: 817 warm: 815 heated: 774 pack: 767 trousers: 764 v: 763 winter: 754 graduation: 750 tactical: 744 solid: 734 vest: 698 military: 689 dress: 676 shorts: 650 printed: 649 plus: 647 outdoor: 645 nurse: 641 heating: 634 waist: 634 lightweight: 625 pieces: 610 head: 605 usb: 604 fashion: 602 soft: 593 kitchen: 589 black: 586 t-shirt: 586 safety: 585 2023: 571 stretch: 571 t: 566 leg: 562 color: 557 blouse: 549 front: 543 hooded: 538 zipper: 527 overalls: 522 one: 517 medical: 499 scarf: 499 cute: 487 relaxed: 481 slim: 479 floral: 469 nursing: 468 (color: 466 pattern: 453 jogger: 445 breathable: 432 outerwear: 428 drawstring: 427 charging: 427 Top 100 words for hobbies_crafts: model: 6762 figure: 3650 funko: 2291 gift: 2210 kit: 2148 scale: 2086 pop!: 2085 vinyl: 2046 kids: 2015 toys: 1961 collectable: 1925 official: 1888 idea: 1838 merchandise: 1821 adults: 1701 fans: 1612 car: 1280 card: 1214 collectors: 1171 set: 1162 rc: 1091 plastic: 1076 display: 1069 scissors: 1038 cutting: 884 metal: 817 hobby: 745 gundam: 729 diecast: 689 tamiya: 688 toy: 677 mini: 654 bandai: 639 collection: 626 cutter: 608 movies: 586 craft: 578 star: 547 building: 540 paint: 516 miniature: 513 sewing: 498 train: 485 diy: 480 trading: 478 kit,: 476 black: 449 action: 444 anime: 439 airplane: 438 scenery: 433 2: 433 tool: 432 plane: 430 vallejo: 428 pack: 422 tools: 417 cards: 409 tv: 403 mat: 388 acrylic: 385 figures: 383 alloy: 379 aircraft: 378 revell: 359 steel: 358 railway: 355 box: 354 glass: 346 3: 332 models: 331 1: 329 rotary: 325 –: 324 sleeves: 316 ho: 314 red: 308 fabric: 307 4: 306 green: 304 airfix: 304 color: 303 crawler: 296 game: 295 decoration: 294 wars: 292 ship: 287 military: 281 track: 279 grass: 279 games: 275 scissors,: 274 compatible: 272 1/144: 272 binder: 265 white: 261 knife: 261 army: 260 pcs: 258 stainless: 257 Top 100 words for toys_games: toys: 12101 kids: 11581 toy: 7130 girls: 6088 boys: 5928 ages: 5243 3: 4057 old: 3914 set: 3673 4: 3561 gifts: 3554 gift: 3518 birthday: 3222 6: 3215 year: 3166 5: 2867 baby: 2805 2: 2795 age: 2705 play: 2700 toddler: 2603 learning: 2594 educational: 2534 building: 2452 toddlers: 2428 up: 2301 kit: 2167 game: 2159 8: 2143 7: 2104 years: 2083 kids,: 1867 party: 1750 car: 1747 toys,: 1691 wooden: 1679 stem: 1596 1: 1573 montessori: 1305 sensory: 1290 games: 1283 12: 1282 9: 1236 10: 1190 blocks: 1165 puzzle: 1145 adults: 1127 girl: 1120 toy,: 1095 board: 1091 remote: 1087 dinosaur: 1085 pretend: 1084 animal: 1083 plush: 1080 preschool: 1059 –: 1056 pcs: 1036 rc: 1013 magnetic: 1004 puzzles: 999 mini: 993 pack: 989 control: 980 set,: 974 christmas: 973 girls,: 971 water: 968 fun: 937 boy: 933 bath: 919 outdoor: 898 diy: 893 light: 855 fidget: 824 soft: 813 children: 791 stuffed: 768 construction: 753 3+: 751 ball: 746 kitchen: 736 months: 724 science: 710 electric: 708 led: 673 doll: 664 truck: 656 cars: 655 crafts: 653 11: 644 pool: 638 toddlers,: 634 activity: 620 playset: 613 travel: 599 model: 583 robot: 583 action: 571 kits: 568
In [ ]:
# Define the output directory in HDFS
output_directory = "hdfs://localhost:9000/abd/wordcount_results/"
# Save word count results for each dataset to HDFS
for category, word_count_rdd in word_counts.items():
# Construct the output path for the current category
output_path = output_directory + category
# Save the word count results to HDFS
word_count_rdd.map(lambda x: f"{x[0]}: {x[1]}").saveAsTextFile(output_path)
In [ ]:
# Define the directory where word count results are saved
wordcount_directory = "hdfs://localhost:9000/abd/wordcount_results/"
# Read a sample of the word count results for each dataset from HDFS
for category in word_counts.keys():
# Construct the path to the directory containing word count results for the current category
category_directory = wordcount_directory + category
# Read the first few lines from one of the files in the directory
sample = spark.read.text(category_directory).head(5)
# Print the sample for the current category
print(f"\nSample of word count results for {category}:")
for row in sample:
print(row['value'])
Sample of word count results for fashion_accessories: eversoft: 28 cotton: 8410 shirts: 4713 double: 2966 crew: 2506 Sample of word count results for data_storage: pro: 1309 ssd: 5064 pcie: 1722 nvme: 2240 4: 891 Sample of word count results for perfume_cologne: nautica: 36 eau: 5742 romantic,: 5 notes: 113 apple,: 11 Sample of word count results for automotive_tools: replacement: 7529 filters: 1502 tpp240f: 1 fits: 1476 envion: 5 Sample of word count results for beauty_personal_care: cordless: 593 water: 2407 flosser,: 11 rechargeable: 728 ipx7: 90 Sample of word count results for bath_body: brush: 1583 brush-soft: 2 dry: 629 body: 4310 exfoliation: 79 Sample of word count results for shaving_hair_removal_products: philips: 805 face: 1495 body: 1474 li-ion: 9 handle,: 92 Sample of word count results for handmade_jewellery: earrings: 8082 threader: 29 hand: 265 bent: 2 women,suitable: 1 Sample of word count results for kids_babies: girls: 34185 tagless: 40 kids: 27612 watch,: 187 digital: 511 Sample of word count results for luggage_travel_gear: belt,: 27 hidden: 66 rfid: 1366 5: 238 set: 920 Sample of word count results for home_decor: fall: 253 spice: 47 scented: 1655 home: 8497 decor,: 2046 Sample of word count results for pets: "peepeego: 1 upgrade: 27 non-slip: 226 dog: 13648 pads: 378 Sample of word count results for handmade_kitchen_dining: thank: 29 gifts: 2954 spa: 1 thoughtful: 12 unique: 740 Sample of word count results for outdoor_cooking: thermopro: 11 digital: 185 instant: 174 read: 14 thermometer: 412 Sample of word count results for men: sunglasses: 805 metal: 383 top: 861 lightweight: 1007 driving: 317 Sample of word count results for women: winter: 1509 100%: 195 genuine: 85 touchscreen: 33 driving: 67 Sample of word count results for grocery: 1lb.: 5 extra: 342 coffee: 2001 beans: 273 hershey's: 27 Sample of word count results for work_safety: ripstop: 136 coveralls: 129 blue,: 232 4t: 2 us: 120 Sample of word count results for hobbies_crafts: keadic: 16 9pcs: 9 tools: 417 hobby: 745 building: 540 Sample of word count results for toys_games: ihaha: 9 truck: 656 toys: 12101 3: 4057 4: 3561
In [ ]:
import matplotlib.pyplot as plt
# Function to plot bar chart for top words
def plot_bar_chart(category, top_words):
plt.figure(figsize=(10, 6))
words = [word for word, _ in top_words]
counts = [count for _, count in top_words]
plt.barh(words, counts, color='skyblue')
plt.xlabel('Frequency')
plt.ylabel('Words')
plt.title(f'Top 10 words for {category}')
plt.gca().invert_yaxis() # Invert y-axis to display highest count at the top
plt.show()
# Plot bar charts for top 10 words in each category
for category, word_count_rdd in word_counts.items():
top_10_words = word_count_rdd.takeOrdered(10, key=lambda x: -x[1])
plot_bar_chart(category, top_10_words)
In [ ]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Function to generate word cloud for top words
def generate_word_cloud(category, top_words):
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(dict(top_words))
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title(f'Word Cloud for {category}')
plt.show()
# Generate word cloud for top 100 words in each category
for category, word_count_rdd in word_counts.items():
top_100_words = word_count_rdd.takeOrdered(100, key=lambda x: -x[1])
generate_word_cloud(category, top_100_words)